Analyze differentially co-expressed genes.
Based on this tutorial: https://deisygysi.github.io/teaching/minipeb2021/
library(data.table)
library(dplyr)
library(igraph)
library(ggalluvial)
library(ggplot2)
require(magrittr)
library(tidyr)
set.seed(1510)
options(bitmapType='cairo')
`%ni%` <- Negate(`%in%`)
plots_dir = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots"
tables_dir = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables"
Prepare the disease-gene associations:
disease_genes_info_file = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/disease_genes/disease_genes_info_2022.csv"
if(!(file.exists(disease_genes_info_file))){
gene_associations_dir = '/work/ccnr/j.aguirreplans/data/gene_associations/data/out'
disease_genes_file = paste(gene_associations_dir, 'GDA_Filtered_04042022.csv', sep='/')
GDA = fread(disease_genes_file) %>% unique()
GDA$HGNC_Symbol = toupper(GDA$HGNC_Symbol)
GDA[is.na(GDA)]<-0
GDA$NewName = tolower(GDA$NewName)
GDA$DescriptorName = tolower(GDA$DescriptorName)
GDA$MESHID = toupper(GDA$MESHID)
GDA$HGNC_Symbol = toupper(GDA$HGNC_Symbol)
GDA %<>%
filter(Strong > 0 |
Weak > 0 |
Incompatible > 0) %>%
mutate(DiseaseName = NewName) %>%
filter(DescriptorName %ni% c("behavioral disciplines and activities")) %>%
mutate(MESHID_remove = stringr::str_detect(TreeNumber, pattern = "C22|C25|F04")) %>%
mutate(MESHID_remove = ifelse(TreeNumber == "F01", TRUE,MESHID_remove)) %>%
filter(!(MESHID_remove)) %>%
select(DiseaseName, HGNC_Symbol, DescriptorName) %>%
unique()
# Make columns without special characters
GDA$DiseaseName.no.sp.char <- gsub(' ', '.', gsub(', ', '.', gsub('-', '.', gsub('[\\(\\)]', '', GDA$DiseaseName))))
GDA$DescriptorName.no.sp.char <- gsub(' ', '.', gsub(', ', '.', gsub('-', '.', gsub('[\\(\\)]', '', GDA$DescriptorName))))
# Write output file
GDA %>% fwrite(disease_genes_info_file)
} else {
GDA = fread(disease_genes_info_file)
}
head(GDA)
## DiseaseName HGNC_Symbol
## 1: hepatomegaly A1BG
## 2: schizophrenia A1BG
## 3: acute kidney injury A2M
## 4: adenoma liver cell A2M
## 5: alzheimer disease A2M
## 6: carcinoma hepatocellular A2M
## DescriptorName
## 1: digestive system diseases|pathological conditions, signs and symptoms
## 2: mental disorders
## 3: female urogenital diseases and pregnancy complications|male urogenital diseases
## 4: digestive system diseases|neoplasms
## 5: mental disorders|nervous system diseases
## 6: digestive system diseases|neoplasms
## DiseaseName.no.sp.char
## 1: hepatomegaly
## 2: schizophrenia
## 3: acute.kidney.injury
## 4: adenoma.liver.cell
## 5: alzheimer.disease
## 6: carcinoma.hepatocellular
## DescriptorName.no.sp.char
## 1: digestive.system.diseases|pathological.conditions.signs.and.symptoms
## 2: mental.disorders
## 3: female.urogenital.diseases.and.pregnancy.complications|male.urogenital.diseases
## 4: digestive.system.diseases|neoplasms
## 5: mental.disorders|nervous.system.diseases
## 6: digestive.system.diseases|neoplasms
There are 65425 disease-gene associations (870 diseases, 14833 genes).
Filter disease-gene associations by the disease “breast neoplasms”:
disease_name = "breast.neoplasms"
GDA_breast = GDA %>% select("DiseaseName.no.sp.char", "HGNC_Symbol") %>% filter(DiseaseName.no.sp.char == disease_name)
dim(GDA_breast)
## [1] 1905 2
head(GDA_breast)
## DiseaseName.no.sp.char HGNC_Symbol
## 1: breast.neoplasms ABCB1
## 2: breast.neoplasms ABCC1
## 3: breast.neoplasms ABCG2
## 4: breast.neoplasms ABL1
## 5: breast.neoplasms ACHE
## 6: breast.neoplasms ACTA2
Compile the files of the differential co-expression analysis from CODINA classifying the nodes:
input_dir = "/work/ccnr/j.aguirreplans/Scipher/SampleSize/differential_coexpression_analysis/TCGA-BRCA_Breast.Mammary.Tissue"
result_files = list.files(input_dir)
cols = c("file_name", "size", "rep_D", "rep_N")
file_info_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
for(file_name in result_files){
info_file = gsub(".txt","",file_name)
file_split1 = strsplit(info_file, split="_")[[1]]
if(file_split1[1] == "diffanalysis"){
type_analysis = file_split1[2]
if(type_analysis == "nodes"){
pval = tail(file_split1, n=1)
info_file = gsub(paste("diffanalysis_", type_analysis, "_", sep=""),"",info_file)
info_file = gsub(paste("_pval_", pval, sep=""),"",info_file)
file_split2 = strsplit(info_file, split="___")[[1]]
file_D_split = strsplit(gsub(".net","",file_split2[1]), split="_")[[1]]
file_N_split = strsplit(gsub(".net","",file_split2[2]), split="_")[[1]]
method = file_D_split[1]
size = as.numeric(file_D_split[(length(file_D_split)-2)])
r_D = as.numeric(file_D_split[(length(file_D_split))])
r_N = as.numeric(file_N_split[(length(file_N_split))])
dataset_D = file_D_split[(length(file_D_split)-4)]
dataset_N = file_N_split[(length(file_N_split)-4)]
file_info_df <- rbind(file_info_df, data.frame(file_name=file_name, size=size, rep_D=r_D, rep_N=r_N))
}
}
}
head(file_info_df)
## file_name
## 1 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_100_rep_1.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_100_rep_1.net_pval_0.05.txt
## 2 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_100_rep_3.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_100_rep_3.net_pval_0.05.txt
## 3 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_100_rep_4.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_100_rep_4.net_pval_0.05.txt
## 4 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_100_rep_5.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_100_rep_5.net_pval_0.05.txt
## 5 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_120_rep_2.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_120_rep_2.net_pval_0.05.txt
## 6 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_120_rep_3.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_120_rep_3.net_pval_0.05.txt
## size rep_D rep_N
## 1 100 1 1
## 2 100 3 3
## 3 100 4 4
## 4 100 5 5
## 5 120 2 2
## 6 120 3 3
Read the files of the differential co-expression analysis from CODINA classifying the nodes, and put them together in a single table:
pval_correction_field = "pval_Phi_Tilde.adj.fdr"
pval_threshold = 0.05
cols = c("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval", "size", "rep_D", "rep_N")
nodes_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
sizes_list = sort(unique(file_info_df$size))
for(x in seq(sizes_list)){
size = sizes_list[x]
#print(size)
same_size_files_df = file_info_df %>% filter(size==!!size) %>% arrange(rep_D, rep_N)
input_files = same_size_files_df$file_name
# same_size_info_df = data.frame()
for(input_file in input_files){
r_D = (same_size_files_df %>% filter(file_name == input_file))$rep_D
r_N = (same_size_files_df %>% filter(file_name == input_file))$rep_N
individual_df = fread(paste(input_dir, input_file, sep="/"))
node_results_filt_df = individual_df %>%
select(Node, Phi_tilde, !!as.symbol(pval_correction_field)) %>%
unique() %>%
rename("pval" = !!as.symbol(pval_correction_field)) %>%
left_join(GDA_breast, by=c("Node"="HGNC_Symbol"))
if(nrow(nodes_df) == 0){
nodes_df = cbind((node_results_filt_df %>% select("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval")), data.frame(size=size, rep_D=r_D, rep_N=r_N))
} else {
nodes_df = nodes_df %>% full_join(cbind((node_results_filt_df %>% select("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval")), data.frame(size=size, rep_D=r_D, rep_N=r_N)), by=c("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval", "size", "rep_D", "rep_N"))
}
}
}
# Define Phi_name
nodes_df$Phi_name = ifelse(nodes_df$Phi_tilde == "a", "common", ifelse(nodes_df$Phi_tilde == "b.D", "different", ifelse(nodes_df$Phi_tilde == "g.D", "disease-specific", ifelse(nodes_df$Phi_tilde == "g.N", "normal-specific", "undefined"))))
head(nodes_df)
## Node DiseaseName.no.sp.char Phi_tilde pval size rep_D rep_N
## 1: AAAS <NA> g.N 5.213856e-12 20 1 1
## 2: AACS <NA> g.N 9.553415e-07 20 1 1
## 3: AAGAB <NA> U 3.745062e-01 20 1 1
## 4: AAR2 <NA> g.D 7.709075e-03 20 1 1
## 5: AARD <NA> U 3.745062e-01 20 1 1
## 6: AARS2 <NA> g.N 1.835766e-02 20 1 1
## Phi_name
## 1: normal-specific
## 2: normal-specific
## 3: undefined
## 4: disease-specific
## 5: undefined
## 6: normal-specific
We consider as “undefined” both the genes that are classified as U, but also the ones that are not significantly classified in any of the categories. To do so, we need create an empty table with all considered genes for all sizes and repetitions, and fill it with the data that we already have:
reps = sort(unique(paste(nodes_df$size, nodes_df$rep_D, nodes_df$rep_N, sep="-")))
nodes_empty_df = expand.grid(sort(unique(nodes_df$Node)), reps)
nodes_empty_df = nodes_empty_df %>% separate("Var2", into=c("size", "rep_D", "rep_N"), sep="-", convert=TRUE)
colnames(nodes_empty_df) = c("Node", "size", "rep_D", "rep_N")
nodes_empty_df = nodes_empty_df %>% left_join((nodes_df %>% select(Node, DiseaseName.no.sp.char) %>% unique()), by="Node") %>% arrange(Node, size, rep_D, rep_N)
nodes_expanded_df = nodes_df %>% full_join(nodes_empty_df, by = c("Node", "DiseaseName.no.sp.char", "size", "rep_D", "rep_N")) %>% arrange(Node, size, rep_D)
nodes_expanded_df = nodes_expanded_df %>%
mutate(Phi_tilde = replace(Phi_tilde, is.na(Phi_tilde), "U")) %>%
mutate(Phi_name = replace(Phi_name, is.na(Phi_name), "undefined"))
nodes_expanded_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_nodes.txt", sep="/")
nodes_expanded_df %>% fwrite(nodes_expanded_file)
head(nodes_expanded_df)
## Node DiseaseName.no.sp.char Phi_tilde pval size rep_D rep_N
## 1: A2M breast.neoplasms U NA 20 1 1
## 2: A2M breast.neoplasms U NA 20 2 2
## 3: A2M breast.neoplasms g.D 5.308776e-07 20 3 3
## 4: A2M breast.neoplasms U NA 20 4 4
## 5: A2M breast.neoplasms U NA 20 5 5
## 6: A2M breast.neoplasms g.N 1.259754e-06 40 1 1
## Phi_name
## 1: undefined
## 2: undefined
## 3: disease-specific
## 4: undefined
## 5: undefined
## 6: normal-specific
We count how many times each gene category appears in each sample size:
# Count for each gene and size the number of times that each category is predicted
nodes_counted_df = nodes_expanded_df %>%
group_by(Node, DiseaseName.no.sp.char, Phi_tilde, Phi_name, size) %>%
summarize(n_phi = n()) %>%
ungroup() %>%
arrange(Node, size)
## `summarise()` has grouped output by 'Node', 'DiseaseName.no.sp.char',
## 'Phi_tilde', 'Phi_name'. You can override using the `.groups` argument.
nodes_counted_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_nodes_counted.txt", sep="/")
nodes_counted_df %>% fwrite(nodes_counted_file)
head(nodes_counted_df)
## # A tibble: 6 × 6
## Node DiseaseName.no.sp.char Phi_tilde Phi_name size n_phi
## <chr> <chr> <chr> <chr> <dbl> <int>
## 1 A2M breast.neoplasms g.D disease-specific 20 1
## 2 A2M breast.neoplasms U undefined 20 4
## 3 A2M breast.neoplasms g.N normal-specific 40 1
## 4 A2M breast.neoplasms U undefined 40 4
## 5 A2M breast.neoplasms g.D disease-specific 60 3
## 6 A2M breast.neoplasms g.N normal-specific 60 1
We select the gene category that appears more for in the different repetitions of each sample size:
# Select the for each gene and size the node type that occurs more times in the different repetitions
nodes_filtered_df = nodes_counted_df %>%
group_by(Node, DiseaseName.no.sp.char, size) %>%
filter(n_phi == max(n_phi)) %>%
arrange(Node, size) %>%
sample_n(1) %>% # Choose randomly cases in which there is the same number of node types
ungroup()
nodes_filtered_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_nodes_filtered.txt", sep="/")
nodes_filtered_df %>% fwrite(nodes_filtered_file)
head(nodes_filtered_df)
## # A tibble: 6 × 6
## Node DiseaseName.no.sp.char Phi_tilde Phi_name size n_phi
## <chr> <chr> <chr> <chr> <dbl> <int>
## 1 A2M breast.neoplasms U undefined 20 4
## 2 A2M breast.neoplasms U undefined 40 4
## 3 A2M breast.neoplasms g.D disease-specific 60 3
## 4 A2M breast.neoplasms g.N normal-specific 80 3
## 5 A2M breast.neoplasms g.D disease-specific 100 2
## 6 A2M breast.neoplasms g.N normal-specific 120 2
We count the number of disease genes in each category for each size:
counts_categories_empty_df = expand.grid(sort(unique(nodes_expanded_df$Phi_name)), reps)
counts_categories_empty_df = counts_categories_empty_df %>% separate("Var2", into=c("size", "rep_D", "rep_N"), sep="-", convert=TRUE)
colnames(counts_categories_empty_df) = c("Phi_name", "size", "rep_D", "rep_N")
counts_categories_df = nodes_expanded_df %>%
filter(!(is.na(DiseaseName.no.sp.char))) %>%
group_by(Phi_name, size, rep_D, rep_N) %>% count() %>% rename("n_disease"="n") %>% arrange(size)
counts_categories_df = counts_categories_df %>% full_join(counts_categories_empty_df, by = c("Phi_name", "size", "rep_D", "rep_N")) %>% arrange(Phi_name, size, rep_D, rep_N)
counts_categories_df$n_disease = ifelse(is.na(counts_categories_df$n_disease), 0, counts_categories_df$n_disease)
counts_categories_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_category_counts.txt", sep="/")
counts_categories_df %>% fwrite(counts_categories_file)
head(counts_categories_df)
## # A tibble: 6 × 5
## # Groups: Phi_name, size, rep_D, rep_N [6]
## Phi_name size rep_D rep_N n_disease
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 common 20 1 1 0
## 2 common 20 2 2 0
## 3 common 20 3 3 0
## 4 common 20 4 4 0
## 5 common 20 5 5 0
## 6 common 40 1 1 0
We plot the number of disease genes in each category:
custom_palette = c(
"#619CFF", #common
"#F8766D", #disease-specific
"#00BA38", #normal-specific
"#808080", #undefined
"#C77CFF" #different
)
counts_categories_df %>%
group_by(Phi_name, size) %>%
summarize(mean_disease = mean(n_disease), sd_disease = sd(n_disease), max_disease=max(n_disease), min_disease=min(n_disease)) %>%
ggplot(aes(x = size, y = mean_disease, fill = Phi_name)) +
geom_line(aes(col = Phi_name)) +
geom_ribbon(aes(ymin=min_disease, ymax=max_disease), alpha=0.2) +
xlab("Number of samples") +
ylab("Number of disease genes") +
scale_fill_manual(values = custom_palette) +
scale_color_manual(values = custom_palette) +
guides(col=guide_legend(title="Gene category"), fill=guide_legend(title="Gene category")) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
plot_file = paste(plots_dir, "codina_tcga_brca_gtex_breast_counts_disease_genes.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
We create the alluvium plot showing the flow between gene categories across sample size:
nodes_filtered_df$Phi_name <- as.factor(nodes_filtered_df$Phi_name)
nodes_filtered_df$size = as.character(nodes_filtered_df$size)
levels(nodes_filtered_df$Phi_name) = sort(unique(nodes_filtered_df$Phi_name))
nodes_filtered_df$size <- factor(nodes_filtered_df$size, levels=as.character(sort(unique(as.integer(nodes_filtered_df$size)))))
custom_palette = c(
"#619CFF", #common
"#F8766D", #disease-specific
"#00BA38", #normal-specific
"#808080", #undefined
"#C77CFF" #different
)
plot_file = paste(plots_dir, "codina_tcga_brca_gtex_breast_change_disease_genes.png", sep="/")
nodes_filtered_df %>%
filter(size %in% c(20, 80, 140, 200, 260, 320, 380, 440)) %>%
filter(!(is.na(DiseaseName.no.sp.char))) %>%
ggplot(aes(x = size, stratum = Phi_name, alluvium = Node,
fill = Phi_name, label = Phi_name)) +
scale_fill_manual(values = custom_palette) +
scale_color_manual(values = custom_palette) +
geom_flow(stat = "alluvium", lode.guidance = "frontback") +
geom_stratum() +
xlab("Number of samples") +
ylab("Number of disease genes") +
guides(fill=guide_legend(title="Gene category")) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"),
axis.title = element_text(size = 14, face="bold"),
axis.text = element_text(size = 12),
#legend.position = "bottom",
legend.text = element_text(size = 12),
legend.title=element_text(size=14, face="bold"))
ggsave(
plot_file,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
We check the behavior specific genes associated with breast neoplasms:
# Nodes of breast neoplasms: https://www.nationalbreastcancer.org/other-breast-cancer-genes
# https://www.cancer.org/cancer/breast-cancer/risk-and-prevention/breast-cancer-risk-factors-you-cannot-change.html#:~:text=BRCA1%20and%20BRCA2%3A%20The%20most,which%20can%20lead%20to%20cancer.
nodes_to_follow = c("BRCA1", "PALB2", "CHEK2", "CDH1", "PTEN", "STK11", "TP53") ### WE ARE MISSING BRCA2!!!!
for (important_node in nodes_to_follow){
# Filter results by important node
important_df = nodes_expanded_df %>%
filter(size %in% c(20, 80, 140, 200, 260, 320, 380, 440)) %>%
filter(!(is.na(DiseaseName.no.sp.char))) %>%
filter(Node == important_node) %>%
select(Phi_name, size, rep_D, rep_N) %>%
group_by(Phi_name, size) %>%
summarize(n_evidences = n()) %>%
ungroup()
# Define levels
important_df$Phi_name <- as.factor(important_df$Phi_name)
important_df$size = as.character(important_df$size)
levels(important_df$Phi_name) = sort(unique(important_df$Phi_name))
important_df$size <- factor(important_df$size, levels=as.character(sort(unique(as.integer(important_df$size)))))
# Define palette
if (levels(important_df$Phi_name) == c("disease-specific", "normal-specific", "undefined")){
personalized_palette = c("#F8766D", "#00BA38", "#808080")
} else if (levels(important_df$Phi_name) == c("normal-specific", "undefined")){
personalized_palette = c("#00BA38", "#808080")
} else if (levels(important_df$Phi_name) == c("common", "normal-specific", "undefined")){
personalized_palette = c("#619CFF", "#00BA38", "#808080")
} else if (levels(important_df$Phi_name) == c("normal-specific")){
personalized_palette = c("#00BA38")
} else {
personalized_palette = c("#619CFF", "#F8766D", "#00BA38", "#808080", "#C77CFF")
}
# Plot
important_node_plot = important_df %>%
ggplot(aes(fill=Phi_name, y=n_evidences, x=size)) +
geom_bar(position="fill", stat="identity", width = 0.7, col="black") +
#geom_bar(position = position_dodge(), stat="identity", col="black", width = 0.6) +
xlab("Number of samples") +
ylab("Fraction of evidences") +
scale_fill_manual(values = personalized_palette) +
guides(fill=guide_legend(title="Gene category")) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
plot_file = paste(plots_dir, paste("codina_tcga_brca_gtex_breast_important_", important_node, ".png", sep=""), sep="/")
ggsave(
plot_file,
plot=important_node_plot,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
print(important_node_plot)
}
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## Warning in levels(important_df$Phi_name) == c("normal-specific", "undefined"):
## longer object length is not a multiple of shorter object length
## Warning in if (levels(important_df$Phi_name) == c("normal-specific",
## "undefined")) {: the condition has length > 1 and only the first element will be
## used
## Warning in if (levels(important_df$Phi_name) == c("common", "normal-specific", :
## the condition has length > 1 and only the first element will be used
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : longer object length is not a multiple of shorter object length
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## Warning in if (levels(important_df$Phi_name) == c("normal-specific",
## "undefined")) {: the condition has length > 1 and only the first element will be
## used
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : longer object length is not a multiple of shorter object length
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## Warning in if (levels(important_df$Phi_name) == c("normal-specific",
## "undefined")) {: the condition has length > 1 and only the first element will be
## used
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : longer object length is not a multiple of shorter object length
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## Warning in if (levels(important_df$Phi_name) == c("normal-specific",
## "undefined")) {: the condition has length > 1 and only the first element will be
## used
Calculate the fraction of unchanged / changed genes in each category for each sample size:
# Define output table
cols = c("Node", "Phi_name.prev", "size.prev", "Phi_name.curr", "size.curr")
change_of_category_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
# Remove levels
nodes_filtered_df$size = as.numeric(as.character(nodes_filtered_df$size))
# For each size, compare nodes with the ones of previous size
for (x in seq(sizes_list)){
size = sizes_list[x]
nodes_filtered_by_selected_size = nodes_filtered_df %>%
filter((!(is.na(DiseaseName.no.sp.char))) & (size == !!size)) %>%
select(Node, Phi_name, size)
if (x == 1){
previous_nodes_filtered = data.frame(nodes_filtered_by_selected_size)
} else {
nodes_filtered_compared_by_sizes = previous_nodes_filtered %>%
inner_join(nodes_filtered_by_selected_size, by=("Node")) %>%
rename(c("Phi_name.prev"="Phi_name.x", "Phi_name.curr"="Phi_name.y", "size.prev"="size.x", "size.curr"="size.y"))
change_of_category_df = rbind(change_of_category_df, nodes_filtered_compared_by_sizes)
previous_nodes_filtered = data.frame(nodes_filtered_by_selected_size)
}
}
# Detect changes
change_of_category_df$transition_type = ifelse(change_of_category_df$Phi_name.prev == change_of_category_df$Phi_name.curr, "stable", "change")
change_of_category_df$transition_name = paste(change_of_category_df$Phi_name.prev, change_of_category_df$Phi_name.curr, sep=" / ")
# Print table
change_of_category_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_changes.txt", sep="/")
change_of_category_df %>% fwrite(change_of_category_file)
head(change_of_category_df)
## Node Phi_name.prev size.prev Phi_name.curr size.curr transition_type
## 1 A2M undefined 20 undefined 40 stable
## 2 ABCA3 undefined 20 undefined 40 stable
## 3 ABCB10 undefined 20 disease-specific 40 change
## 4 ABCB8 undefined 20 undefined 40 stable
## 5 ABCC1 undefined 20 undefined 40 stable
## 6 ABL1 undefined 20 undefined 40 stable
## transition_name
## 1 undefined / undefined
## 2 undefined / undefined
## 3 undefined / disease-specific
## 4 undefined / undefined
## 5 undefined / undefined
## 6 undefined / undefined
total_stable_change_genes_df = change_of_category_df %>%
group_by(size.prev) %>%
mutate(n_total = n()) %>%
ungroup() %>%
select(Node, Phi_name.prev, size.prev, transition_type, n_total) %>%
group_by(size.prev, transition_type, n_total) %>%
summarize(n_genes = n()) %>%
ungroup() %>%
mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'size.prev', 'transition_type'. You can
## override using the `.groups` argument.
total_stable_change_genes_df$Phi_name.prev = "all"
categories_stable_change_genes_df = change_of_category_df %>%
group_by(size.prev) %>%
mutate(n_total = n()) %>%
ungroup() %>%
filter(transition_type == "stable") %>%
select(Node, Phi_name.prev, size.prev, transition_type, n_total) %>%
group_by(Phi_name.prev, size.prev, transition_type, n_total) %>%
summarize(n_genes = n()) %>%
ungroup() %>%
mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'Phi_name.prev', 'size.prev',
## 'transition_type'. You can override using the `.groups` argument.
rbind(total_stable_change_genes_df, categories_stable_change_genes_df) %>%
filter(transition_type == "stable") %>%
ggplot(aes(x = size.prev, y = frac_genes, col = Phi_name.prev)) +
geom_line(aes(size = Phi_name.prev)) +
xlab("Number of samples") +
ylab("Fraction of stable genes") +
scale_size_manual(values = c("all" = 1, "common" = 0.5, "disease-specific" = 0.5, "normal-specific" = 0.5, "undefined" = 0.5)) +
scale_color_manual(values = c("#CD9600", "#619CFF", "#F8766D", "#00BA38", "#808080")) +
guides(col=guide_legend(title="Gene category"), size=FALSE) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
plot_file = paste(plots_dir, "codina_tcga_brca_gtex_breast_stable_genes.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
Filter disease-gene associations by the disease “arthritis rheumatoid”:
disease_name = "arthritis.rheumatoid"
GDA_ra = GDA %>% select("DiseaseName.no.sp.char", "HGNC_Symbol") %>% filter(DiseaseName.no.sp.char == disease_name)
dim(GDA_ra)
## [1] 419 2
head(GDA_ra)
## DiseaseName.no.sp.char HGNC_Symbol
## 1: arthritis.rheumatoid ABCB1
## 2: arthritis.rheumatoid ABCC2
## 3: arthritis.rheumatoid ABCC3
## 4: arthritis.rheumatoid ABCC4
## 5: arthritis.rheumatoid ABCC5
## 6: arthritis.rheumatoid ABCG2
Compile the files of the differential co-expression analysis from CODINA:
input_dir = "/work/ccnr/j.aguirreplans/Scipher/SampleSize/differential_coexpression_analysis/complete.dataset_Whole.Blood"
result_files = list.files(input_dir)
cols = c("file_name", "size", "rep_D", "rep_N")
file_info_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
for(file_name in result_files){
info_file = gsub(".txt","",file_name)
file_split1 = strsplit(info_file, split="_")[[1]]
if(file_split1[1] == "diffanalysis"){
type_analysis = file_split1[2]
if(type_analysis == "nodes"){
pval = tail(file_split1, n=1)
info_file = gsub(paste("diffanalysis_", type_analysis, "_", sep=""),"",info_file)
info_file = gsub(paste("_pval_", pval, sep=""),"",info_file)
file_split2 = strsplit(info_file, split="___")[[1]]
file_D_split = strsplit(gsub(".net","",file_split2[1]), split="_")[[1]]
file_N_split = strsplit(gsub(".net","",file_split2[2]), split="_")[[1]]
method = file_D_split[1]
size = as.numeric(file_D_split[(length(file_D_split)-2)])
r_D = as.numeric(file_D_split[(length(file_D_split))])
r_N = as.numeric(file_N_split[(length(file_N_split))])
dataset_D = file_D_split[(length(file_D_split)-4)]
dataset_N = file_N_split[(length(file_N_split)-4)]
file_info_df <- rbind(file_info_df, data.frame(file_name=file_name, size=size, rep_D=r_D, rep_N=r_N))
}
}
}
head(file_info_df)
## file_name
## 1 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_1.net___pearson_gtex_Whole.Blood_size_100_rep_1.net_pval_0.05.txt
## 2 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_2.net___pearson_gtex_Whole.Blood_size_100_rep_2.net_pval_0.05.txt
## 3 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_3.net___pearson_gtex_Whole.Blood_size_100_rep_3.net_pval_0.05.txt
## 4 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_4.net___pearson_gtex_Whole.Blood_size_100_rep_4.net_pval_0.05.txt
## 5 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_5.net___pearson_gtex_Whole.Blood_size_100_rep_5.net_pval_0.05.txt
## 6 diffanalysis_nodes_pearson_scipher_complete.dataset_size_120_rep_1.net___pearson_gtex_Whole.Blood_size_120_rep_1.net_pval_0.05.txt
## size rep_D rep_N
## 1 100 1 1
## 2 100 2 2
## 3 100 3 3
## 4 100 4 4
## 5 100 5 5
## 6 120 1 1
Read the files of the differential co-expression analysis from CODINA classifying the nodes, and put them together in a single table:
pval_correction_field = "pval_Phi_Tilde.adj.fdr"
pval_threshold = 0.05
cols = c("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval", "size", "rep_D", "rep_N")
nodes_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
sizes_list = sort(unique(file_info_df$size))
for(x in seq(sizes_list)){
size = sizes_list[x]
#print(size)
same_size_files_df = file_info_df %>% filter(size==!!size) %>% arrange(rep_D, rep_N)
input_files = same_size_files_df$file_name
# same_size_info_df = data.frame()
for(input_file in input_files){
r_D = (same_size_files_df %>% filter(file_name == input_file))$rep_D
r_N = (same_size_files_df %>% filter(file_name == input_file))$rep_N
individual_df = fread(paste(input_dir, input_file, sep="/"))
node_results_filt_df = individual_df %>%
select(Node, Phi_tilde, !!as.symbol(pval_correction_field)) %>%
unique() %>%
rename("pval" = !!as.symbol(pval_correction_field)) %>%
left_join(GDA_ra, by=c("Node"="HGNC_Symbol"))
if(nrow(nodes_df) == 0){
nodes_df = cbind((node_results_filt_df %>% select("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval")), data.frame(size=size, rep_D=r_D, rep_N=r_N))
} else {
nodes_df = nodes_df %>% full_join(cbind((node_results_filt_df %>% select("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval")), data.frame(size=size, rep_D=r_D, rep_N=r_N)), by=c("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval", "size", "rep_D", "rep_N"))
}
}
}
# Define Phi_name
nodes_df$Phi_name = ifelse(nodes_df$Phi_tilde == "a", "common", ifelse(nodes_df$Phi_tilde == "b.D", "different", ifelse(nodes_df$Phi_tilde == "g.D", "disease-specific", ifelse(nodes_df$Phi_tilde == "g.N", "normal-specific", "undefined"))))
head(nodes_df)
## Node DiseaseName.no.sp.char Phi_tilde pval size rep_D rep_N
## 1: A2M <NA> g.N 2.485035e-81 20 1 1
## 2: A3GALT2 <NA> g.N 2.215557e-07 20 1 1
## 3: AAAS <NA> g.N 1.645289e-29 20 1 1
## 4: AACS <NA> g.N 2.606093e-03 20 1 1
## 5: AAGAB <NA> g.D 1.194946e-52 20 1 1
## 6: AAK1 <NA> g.D 6.865455e-19 20 1 1
## Phi_name
## 1: normal-specific
## 2: normal-specific
## 3: normal-specific
## 4: normal-specific
## 5: disease-specific
## 6: disease-specific
We consider as “undefined” both the genes that are classified as U, but also the ones that are not significantly classified in any of the categories. To do so, we need create an empty table with all considered genes for all sizes and repetitions, and fill it with the data that we already have:
reps = sort(unique(paste(nodes_df$size, nodes_df$rep_D, nodes_df$rep_N, sep="-")))
nodes_empty_df = expand.grid(sort(unique(nodes_df$Node)), reps)
nodes_empty_df = nodes_empty_df %>% separate("Var2", into=c("size", "rep_D", "rep_N"), sep="-", convert=TRUE)
colnames(nodes_empty_df) = c("Node", "size", "rep_D", "rep_N")
nodes_empty_df = nodes_empty_df %>% left_join((nodes_df %>% select(Node, DiseaseName.no.sp.char) %>% unique()), by="Node") %>% arrange(Node, size, rep_D, rep_N)
nodes_expanded_df = nodes_df %>% full_join(nodes_empty_df, by = c("Node", "DiseaseName.no.sp.char", "size", "rep_D", "rep_N")) %>% arrange(Node, size, rep_D)
nodes_expanded_df = nodes_expanded_df %>%
mutate(Phi_tilde = replace(Phi_tilde, is.na(Phi_tilde), "U")) %>%
mutate(Phi_name = replace(Phi_name, is.na(Phi_name), "undefined"))
nodes_expanded_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_nodes.txt", sep="/")
nodes_expanded_df %>% fwrite(nodes_expanded_file)
head(nodes_expanded_df)
## Node DiseaseName.no.sp.char Phi_tilde pval size rep_D rep_N
## 1: A1BG-AS1 <NA> U NA 20 1 1
## 2: A1BG-AS1 <NA> U NA 20 2 2
## 3: A1BG-AS1 <NA> U 3.696587e-01 20 3 3
## 4: A1BG-AS1 <NA> U NA 20 4 4
## 5: A1BG-AS1 <NA> U NA 20 5 5
## 6: A1BG-AS1 <NA> g.N 4.971565e-50 40 1 1
## Phi_name
## 1: undefined
## 2: undefined
## 3: undefined
## 4: undefined
## 5: undefined
## 6: normal-specific
We count how many times each gene category appears in each sample size:
# Count for each gene and size the number of times that each category is predicted
nodes_counted_df = nodes_expanded_df %>%
group_by(Node, DiseaseName.no.sp.char, Phi_tilde, Phi_name, size) %>%
summarize(n_phi = n()) %>%
ungroup() %>%
arrange(Node, size)
## `summarise()` has grouped output by 'Node', 'DiseaseName.no.sp.char',
## 'Phi_tilde', 'Phi_name'. You can override using the `.groups` argument.
nodes_counted_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_nodes_counted.txt", sep="/")
nodes_counted_df %>% fwrite(nodes_counted_file)
head(nodes_counted_df)
## # A tibble: 6 × 6
## Node DiseaseName.no.sp.char Phi_tilde Phi_name size n_phi
## <chr> <chr> <chr> <chr> <dbl> <int>
## 1 A1BG-AS1 <NA> U undefined 20 5
## 2 A1BG-AS1 <NA> g.D disease-specific 40 1
## 3 A1BG-AS1 <NA> g.N normal-specific 40 4
## 4 A1BG-AS1 <NA> g.D disease-specific 60 2
## 5 A1BG-AS1 <NA> g.N normal-specific 60 3
## 6 A1BG-AS1 <NA> a common 80 1
We select the gene category that appears more for in the different repetitions of each sample size:
# Select the for each gene and size the node type that occurs more times in the different repetitions
nodes_filtered_df = nodes_counted_df %>%
group_by(Node, DiseaseName.no.sp.char, size) %>%
filter(n_phi == max(n_phi)) %>%
arrange(Node, size) %>%
sample_n(1) %>% # Choose randomly cases in which there is the same number of node types
ungroup()
nodes_filtered_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_nodes_filtered.txt", sep="/")
nodes_filtered_df %>% fwrite(nodes_filtered_file)
head(nodes_filtered_df)
## # A tibble: 6 × 6
## Node DiseaseName.no.sp.char Phi_tilde Phi_name size n_phi
## <chr> <chr> <chr> <chr> <dbl> <int>
## 1 A1BG-AS1 <NA> U undefined 20 5
## 2 A1BG-AS1 <NA> g.N normal-specific 40 4
## 3 A1BG-AS1 <NA> g.N normal-specific 60 3
## 4 A1BG-AS1 <NA> g.D disease-specific 80 3
## 5 A1BG-AS1 <NA> g.D disease-specific 100 2
## 6 A1BG-AS1 <NA> g.D disease-specific 120 2
We count the number of disease genes in each category for each size:
counts_categories_empty_df = expand.grid(sort(unique(nodes_expanded_df$Phi_name)), reps)
counts_categories_empty_df = counts_categories_empty_df %>% separate("Var2", into=c("size", "rep_D", "rep_N"), sep="-", convert=TRUE)
colnames(counts_categories_empty_df) = c("Phi_name", "size", "rep_D", "rep_N")
counts_categories_df = nodes_expanded_df %>%
filter(!(is.na(DiseaseName.no.sp.char))) %>%
group_by(Phi_name, size, rep_D, rep_N) %>% count() %>% rename("n_disease"="n") %>% arrange(size)
counts_categories_df = counts_categories_df %>% full_join(counts_categories_empty_df, by = c("Phi_name", "size", "rep_D", "rep_N")) %>% arrange(Phi_name, size, rep_D, rep_N)
counts_categories_df$n_disease = ifelse(is.na(counts_categories_df$n_disease), 0, counts_categories_df$n_disease)
counts_categories_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_category_counts.txt", sep="/")
counts_categories_df %>% fwrite(counts_categories_file)
head(counts_categories_df)
## # A tibble: 6 × 5
## # Groups: Phi_name, size, rep_D, rep_N [6]
## Phi_name size rep_D rep_N n_disease
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 common 20 1 1 0
## 2 common 20 2 2 0
## 3 common 20 3 3 0
## 4 common 20 4 4 0
## 5 common 20 5 5 0
## 6 common 40 1 1 1
We plot the number of disease genes in each category:
custom_palette = c(
"#619CFF", #common
"#C77CFF", #different
"#F8766D", #disease-specific
"#00BA38", #normal-specific
"#808080" #undefined
)
counts_categories_df %>%
group_by(Phi_name, size) %>%
summarize(mean_disease = mean(n_disease), sd_disease = sd(n_disease), max_disease=max(n_disease), min_disease=min(n_disease)) %>%
ggplot(aes(x = size, y = mean_disease, fill = Phi_name)) +
geom_line(aes(col = Phi_name)) +
geom_ribbon(aes(ymin=min_disease, ymax=max_disease), alpha=0.2) +
xlab("Number of samples") +
ylab("Number of disease genes") +
scale_fill_manual(values = custom_palette) +
scale_color_manual(values = custom_palette) +
guides(col=guide_legend(title="Gene category"), fill=guide_legend(title="Gene category")) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
plot_file = paste(plots_dir, "codina_scipher_ra_gtex_wholeblood_counts_disease_genes.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
We create the alluvium plot showing the flow between gene categories across sample size:
nodes_filtered_df$Phi_name <- as.factor(nodes_filtered_df$Phi_name)
nodes_filtered_df$size = as.character(nodes_filtered_df$size)
levels(nodes_filtered_df$Phi_name) = sort(unique(nodes_filtered_df$Phi_name))
nodes_filtered_df$size <- factor(nodes_filtered_df$size, levels=as.character(sort(unique(as.integer(nodes_filtered_df$size)))))
custom_palette = c(
"#619CFF", #common
"#F8766D", #disease-specific
"#00BA38", #normal-specific
"#808080", #undefined
"#C77CFF" #different
)
plot_file = paste(plots_dir, "codina_scipher_ra_gtex_wholeblood_change_disease_genes.png", sep="/")
nodes_filtered_df %>%
filter(size %in% c(20, 80, 140, 200, 260, 320, 380, 440)) %>%
filter(!(is.na(DiseaseName.no.sp.char))) %>%
ggplot(aes(x = size, stratum = Phi_name, alluvium = Node,
fill = Phi_name, label = Phi_name)) +
scale_fill_manual(values = custom_palette) +
scale_color_manual(values = custom_palette) +
geom_flow(stat = "alluvium", lode.guidance = "frontback") +
geom_stratum() +
xlab("Number of samples") +
ylab("Number of disease genes") +
guides(fill=guide_legend(title="Gene category")) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"),
axis.title = element_text(size = 14, face="bold"),
axis.text = element_text(size = 12),
#legend.position = "bottom",
legend.text = element_text(size = 12),
legend.title=element_text(size=14, face="bold"))
ggsave(
plot_file,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
We check the behavior specific genes associated with rheumatoid arthritis:
# Get only strong evidences: "TNF", "IL6ST", "IL6R", "PTPN22", "HOTAIR", "MALAT1"
nodes_to_follow = c("IL6ST", "IL6R", "PTPN22", "HOTAIR", "MALAT1") #### !!!! MISSING TNF !!!!
#nodes_to_follow = c("HLA-DRB1", "HLA-DPB1", "IRF5", "PTPN22", "RBPJ", "RUNX1", "STAT4", "TNF")
for (important_node in nodes_to_follow){
# Filter results by important node
important_df = nodes_expanded_df %>%
filter(size %in% c(20, 80, 140, 200, 260, 320, 380, 440)) %>%
filter(!(is.na(DiseaseName.no.sp.char))) %>%
filter(Node == important_node) %>%
select(Phi_name, size, rep_D, rep_N) %>%
group_by(Phi_name, size) %>%
summarize(n_evidences = n()) %>%
ungroup()
# Define levels
important_df$Phi_name <- as.factor(important_df$Phi_name)
important_df$size = as.character(important_df$size)
levels(important_df$Phi_name) = sort(unique(important_df$Phi_name))
important_df$size <- factor(important_df$size, levels=as.character(sort(unique(as.integer(important_df$size)))))
# Define palette
if ((length(levels(important_df$Phi_name)) == length(c("disease-specific", "normal-specific", "undefined"))) && (all(levels(important_df$Phi_name) == c("disease-specific", "normal-specific", "undefined")))){
personalized_palette = c("#F8766D", "#00BA38", "#808080")
} else if ((length(levels(important_df$Phi_name)) == length(c("normal-specific", "undefined"))) && (all(levels(important_df$Phi_name) == c("normal-specific", "undefined")))){
personalized_palette = c("#00BA38", "#808080")
} else if ((length(levels(important_df$Phi_name)) == length(c("common", "normal-specific", "undefined"))) && (all(levels(important_df$Phi_name) == c("common", "normal-specific", "undefined")))){
personalized_palette = c("#619CFF", "#00BA38", "#808080")
} else if ((length(levels(important_df$Phi_name)) == length(c("normal-specific"))) && (all(levels(important_df$Phi_name) == c("normal-specific")))){
personalized_palette = c("#00BA38")
} else if ((length(levels(important_df$Phi_name)) == length(c("common", "disease-specific", "normal-specific", "undefined"))) && (all(levels(important_df$Phi_name) == c("common", "disease-specific", "normal-specific", "undefined")))){
personalized_palette = c("#619CFF", "#F8766D", "#00BA38", "#808080")
} else {
personalized_palette = c("#619CFF", "#F8766D", "#00BA38", "#808080", "#C77CFF")
}
# Plot
important_node_plot = important_df %>%
ggplot(aes(fill=Phi_name, y=n_evidences, x=size)) +
geom_bar(position="fill", stat="identity", width = 0.7, col="black") +
#geom_bar(position = position_dodge(), stat="identity", col="black", width = 0.6) +
xlab("Number of samples") +
ylab("Fraction of evidences") +
scale_fill_manual(values = personalized_palette) +
guides(fill=guide_legend(title="Gene category")) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
plot_file = paste(plots_dir, paste("codina_scipher_ra_gtex_wholeblood_important_", important_node, ".png", sep=""), sep="/")
ggsave(
plot_file,
plot=important_node_plot,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
print(important_node_plot)
}
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.`summarise()` has grouped output by 'Phi_name'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
Calculate the fraction of unchanged / changed genes in each category for each sample size:
# Define output table
cols = c("Node", "Phi_name.prev", "size.prev", "Phi_name.curr", "size.curr")
change_of_category_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
# Remove levels
nodes_filtered_df$size = as.numeric(as.character(nodes_filtered_df$size))
# For each size, compare nodes with the ones of previous size
for (x in seq(sizes_list)){
size = sizes_list[x]
nodes_filtered_by_selected_size = nodes_filtered_df %>%
filter((!(is.na(DiseaseName.no.sp.char))) & (size == !!size)) %>%
select(Node, Phi_name, size)
if (x == 1){
previous_nodes_filtered = data.frame(nodes_filtered_by_selected_size)
} else {
nodes_filtered_compared_by_sizes = previous_nodes_filtered %>%
inner_join(nodes_filtered_by_selected_size, by=("Node")) %>%
rename(c("Phi_name.prev"="Phi_name.x", "Phi_name.curr"="Phi_name.y", "size.prev"="size.x", "size.curr"="size.y"))
change_of_category_df = rbind(change_of_category_df, nodes_filtered_compared_by_sizes)
previous_nodes_filtered = data.frame(nodes_filtered_by_selected_size)
}
}
# Detect changes
change_of_category_df$transition_type = ifelse(change_of_category_df$Phi_name.prev == change_of_category_df$Phi_name.curr, "stable", "change")
change_of_category_df$transition_name = paste(change_of_category_df$Phi_name.prev, change_of_category_df$Phi_name.curr, sep=" / ")
# Print table
change_of_category_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_changes.txt", sep="/")
change_of_category_df %>% fwrite(change_of_category_file)
head(change_of_category_df)
## Node Phi_name.prev size.prev Phi_name.curr size.curr transition_type
## 1 ABCB1 normal-specific 20 normal-specific 40 stable
## 2 ABCC2 undefined 20 normal-specific 40 change
## 3 ABCC3 undefined 20 undefined 40 stable
## 4 ABCC4 undefined 20 disease-specific 40 change
## 5 ABCC5 undefined 20 normal-specific 40 change
## 6 ABCG2 undefined 20 disease-specific 40 change
## transition_name
## 1 normal-specific / normal-specific
## 2 undefined / normal-specific
## 3 undefined / undefined
## 4 undefined / disease-specific
## 5 undefined / normal-specific
## 6 undefined / disease-specific
total_stable_change_genes_df = change_of_category_df %>%
group_by(size.prev) %>%
mutate(n_total = n()) %>%
ungroup() %>%
select(Node, Phi_name.prev, size.prev, transition_type, n_total) %>%
group_by(size.prev, transition_type, n_total) %>%
summarize(n_genes = n()) %>%
ungroup() %>%
mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'size.prev', 'transition_type'. You can
## override using the `.groups` argument.
total_stable_change_genes_df$Phi_name.prev = "all"
categories_stable_change_genes_df = change_of_category_df %>%
group_by(size.prev) %>%
mutate(n_total = n()) %>%
ungroup() %>%
filter(transition_type == "stable") %>%
select(Node, Phi_name.prev, size.prev, transition_type, n_total) %>%
group_by(Phi_name.prev, size.prev, transition_type, n_total) %>%
summarize(n_genes = n()) %>%
ungroup() %>%
mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'Phi_name.prev', 'size.prev',
## 'transition_type'. You can override using the `.groups` argument.
rbind(total_stable_change_genes_df, categories_stable_change_genes_df) %>%
filter(transition_type == "stable") %>%
ggplot(aes(x = size.prev, y = frac_genes, col = Phi_name.prev)) +
geom_line(aes(size = Phi_name.prev)) +
xlab("Number of samples") +
ylab("Fraction of stable genes") +
scale_size_manual(values = c("all" = 1, "common" = 0.5, "disease-specific" = 0.5, "normal-specific" = 0.5, "undefined" = 0.5)) +
scale_color_manual(values = c("#CD9600", "#619CFF", "#F8766D", "#00BA38", "#808080")) +
guides(col=guide_legend(title="Gene category"), size=FALSE) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
plot_file = paste(plots_dir, "codina_scipher_ra_gtex_wholeblood_stable_genes.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)